# Preliminary ----

# Install pacman on first use; it then installs and attaches everything else.
# requireNamespace() only checks availability and does not attach the package.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")

# lubridate is listed explicitly because ymd(), days() and year() are used
# further down; only tidyverse >= 2.0.0 attaches it as part of the core set.
pacman::p_load(tidyverse, lubridate, labelled, data.table, haven, conflicted)

# with conflicted loaded, ambiguous names must be resolved explicitly:
# use dplyr's filter() and select() wherever another package exports them too
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")

# base directory = parent of the directory that here::here() resolves to
basedir <- normalizePath(here::here(".."))

# every data path below is relative to basedir
setwd(basedir)

# input files
groupappeals_path <- "data/groupappeals.rds"
BESIP_path <- "data/BESIP_clean.rds"
BES_path <- "data/BESrounds.rds"
vparty_path <- "data/V-Dem-CPD-Party-V2.rds"

# load appeals dataset
aga <- read_rds(groupappeals_path)

## Construct variable capturing explicit policy mentions ----

# dictionary: regex alternation of stems, matched as substrings
# (no word boundaries are used)
policydictionary <- "bill|legislat|policy|polici|law|amendment"

# TRUE where the lower-cased sentence contains any dictionary stem
policymentionsvec <- aga$text |>
  tolower() |>
  str_detect(pattern = policydictionary)

# count
table(policymentionsvec) # 38527/500336 = 7.7%

# check face validity; examples for appendix
# (20 random sentences with a policy mention, then 20 without)
sample(aga$text[policymentionsvec], size = 20)
sample(aga$text[!policymentionsvec], size = 20)

# store the indicator on the appeals data as a numeric 0/1 column
aga$policymention <- as.numeric(policymentionsvec)

## Construct NV variable ----

# keep the dyad identifiers, text and policy indicator;
# nv (net valence) is probpositive minus probnegative
aga2 <- transmute(
  aga,
  date,
  party,
  group,
  nv = probpositive - probnegative,
  text,
  policymention
)

# face validity: the three sentences with the highest nv ...
aga2 %>%
  arrange(desc(nv)) %>%
  slice_head(n = 3) %>%
  pull(text)

# ... and the three sentences with the lowest nv
aga2 %>%
  arrange(nv) %>%
  slice_head(n = 3) %>%
  pull(text)

## Relabel parties ----

# shorten three party names; every other value of party is left unchanged
aga2 <- aga2 %>%
  mutate(
    party = case_when(
      party == "Conservative Party" ~ "Conservatives",
      party == "Labour Party" ~ "Labour",
      party == "Scottish National Party" ~ "SNP",
      TRUE ~ party
    )
  )

# Function that constructs exposure variable by calculating recent n and nv by dyad ----

# Sum and count of net valence (nv) for one party-group dyad over the
# `dwindow` days up to and including `ddate`, overall and split by
# policymention.
#
# Reads the global data.table `aga2_dt`, which is joined on its key
# (party, group, date) and must exist by the time the function is called.
#
# Arguments:
#   dparty, dgroup  values matched against the party and group key columns
#   ddate           last day of the window (inclusive)
#   dwindow         number of days to look back from ddate
#
# Returns a named list: sum_all, n_all, sum_policy, n_policy, sum_nopolicy,
# n_nopolicy. Sums skip missing nv; counts are the number of non-missing nv.
recentdyadnv <- function(dparty, dgroup, ddate, dwindow) {

  # one key row per calendar day in the window; days without a match are
  # dropped (nomatch = 0L)
  window_days <- seq(from = ddate - days(dwindow), to = ddate, by = "day")
  in_window <- aga2_dt[.(dparty, dgroup, window_days), nomatch = 0L]

  # split the window by whether the sentence mentions policy
  without_policy <- in_window[policymention == 0]
  with_policy <- in_window[policymention == 1]

  # sum of nv and number of non-missing nv values for a set of rows
  tally <- function(rows) {
    list(total = sum(rows$nv, na.rm = TRUE), count = sum(!is.na(rows$nv)))
  }

  overall <- tally(in_window)
  policy <- tally(with_policy)
  nopolicy <- tally(without_policy)

  list(
    sum_all = overall$total,
    n_all = overall$count,
    sum_policy = policy$total,
    n_policy = policy$count,
    sum_nopolicy = nopolicy$total,
    n_nopolicy = nopolicy$count
  )
}

# data.table copy of the appeals data, keyed on (party, group, date) so that
# recentdyadnv() can look up dyad-days with a keyed join
aga2_dt <- as.data.table(aga2)

setkeyv(aga2_dt, c("party", "group", "date"))

# Construct exposure variables for BESIP ----

# load survey data
sd_pa <- read_rds(BESIP_path)

# argument table for recentdyadnv(): one row per survey row, holding the dyad,
# the date of starttime, and a 90-day window
sdpmap <- transmute(
  sd_pa,
  dparty = party,
  dgroup = group,
  ddate = ymd(as.Date(starttime)),
  dwindow = 90
)

# run function on all rows (one result list per survey row)
pmapout <- pmap(sdpmap, recentdyadnv)

# pull each list element out into a numeric vector
sum_all_vector <- purrr::map_dbl(pmapout, "sum_all")
n_all_vector <- purrr::map_dbl(pmapout, "n_all")
sum_policy_vector <- purrr::map_dbl(pmapout, "sum_policy")
n_policy_vector <- purrr::map_dbl(pmapout, "n_policy")
sum_nopolicy_vector <- purrr::map_dbl(pmapout, "sum_nopolicy")
n_nopolicy_vector <- purrr::map_dbl(pmapout, "n_nopolicy")

# attach the vectors to the survey data as new columns
sd_pa <- mutate(
  sd_pa,
  nvsum = sum_all_vector,
  nvn = n_all_vector,
  nvsum_policy = sum_policy_vector,
  nvn_policy = n_policy_vector,
  nvsum_nopolicy = sum_nopolicy_vector,
  nvn_nopolicy = n_nopolicy_vector
)

# Construct exposure variables for BES rounds ----

# load survey data
sd_cs <- read_rds(BES_path)

# dates go further in sd_cs than in aga2, so keep only survey rows dated no
# later than the last appeals date
end_date <- max(aga2$date, na.rm = TRUE)
sd_cs <- filter(sd_cs, date <= end_date)

# argument table for recentdyadnv(): dyad, survey date, 90-day window
sdpmap <- transmute(
  sd_cs,
  dparty = party,
  dgroup = group,
  ddate = date,
  dwindow = 90
)

# run function on all rows (one result list per survey row)
pmapout <- pmap(sdpmap, recentdyadnv)

# pull each list element out into a numeric vector
sum_all_vector <- purrr::map_dbl(pmapout, "sum_all")
n_all_vector <- purrr::map_dbl(pmapout, "n_all")
sum_policy_vector <- purrr::map_dbl(pmapout, "sum_policy")
n_policy_vector <- purrr::map_dbl(pmapout, "n_policy")
sum_nopolicy_vector <- purrr::map_dbl(pmapout, "sum_nopolicy")
n_nopolicy_vector <- purrr::map_dbl(pmapout, "n_nopolicy")

# attach the vectors to the survey data as new columns
sd_cs <- mutate(
  sd_cs,
  nvsum = sum_all_vector,
  nvn = n_all_vector,
  nvsum_policy = sum_policy_vector,
  nvn_policy = n_policy_vector,
  nvsum_nopolicy = sum_nopolicy_vector,
  nvn_nopolicy = n_nopolicy_vector
)

# Robustness: construct exposure variables for variable windows ----

# construct simpler version of recentdyadnv without policy distinction (to save time)
# Sum and count of net valence (nv) for one party-group dyad over the
# `dwindow` days up to and including `ddate`, without the policy split.
#
# Reads the global data.table `aga2_dt`, joined on its key; the arguments are
# the party and group key values, the last day of the window, and the number
# of days to look back.
#
# Returns list(sum_all, n_all): the sum of non-missing nv and how many
# non-missing nv values there are.
recentdyadnv_simple <- function(dparty, dgroup, ddate, dwindow) {

  # one key row per calendar day in the window; days without a match are
  # dropped (nomatch = 0L)
  window_days <- seq(from = ddate - days(dwindow), to = ddate, by = "day")
  window_nv <- aga2_dt[.(dparty, dgroup, window_days), nomatch = 0L]$nv

  list(
    sum_all = sum(window_nv, na.rm = TRUE),
    n_all = sum(!is.na(window_nv))
  )
}

## BESIP ----

# alternative look-back windows, in days
robustness_windows <- c(60, 75, 105, 120)

# dyad and date are the same for every window, so build them once
sdpmap_base <- sd_pa %>%
  transmute(dparty = party, dgroup = group, ddate = ymd(as.Date(starttime)))

for (window_days in robustness_windows) {

  # argument table for this window: one row per survey row
  # (.env$ makes sure the loop variable is used, not a column of the same name)
  sdpmap_window <- sdpmap_base %>%
    mutate(dwindow = .env$window_days)

  # run function on all rows
  pmapout_window <- pmap(sdpmap_window, recentdyadnv_simple)

  # add sum and count to the survey data as nvsum<window> and nvn<window>,
  # e.g. nvsum60 / nvn60
  sd_pa[[paste0("nvsum", window_days)]] <- purrr::map_dbl(pmapout_window, "sum_all")
  sd_pa[[paste0("nvn", window_days)]] <- purrr::map_dbl(pmapout_window, "n_all")
}

## BES rounds ----

# alternative look-back windows, in days
robustness_windows <- c(60, 75, 105, 120)

# dyad and date are the same for every window, so build them once
sdpmap_base <- sd_cs %>%
  transmute(dparty = party, dgroup = group, ddate = date)

for (window_days in robustness_windows) {

  # argument table for this window: one row per survey row
  # (.env$ makes sure the loop variable is used, not a column of the same name)
  sdpmap_window <- sdpmap_base %>%
    mutate(dwindow = .env$window_days)

  # run function on all rows
  pmapout_window <- pmap(sdpmap_window, recentdyadnv_simple)

  # add sum and count to the survey data as nvsum<window> and nvn<window>,
  # e.g. nvsum60 / nvn60
  sd_cs[[paste0("nvsum", window_days)]] <- purrr::map_dbl(pmapout_window, "sum_all")
  sd_cs[[paste0("nvn", window_days)]] <- purrr::map_dbl(pmapout_window, "n_all")
}

# Merge BESIP and BES rounds dfs ----

# shift the BES ids past the largest BESIP id so that ids stay unique
# after the two data sets are stacked
max_id <- max(sd_pa$id)
sd_cs$id <- sd_cs$id + max_id

# make them compatible before merging

## give sd_pa a single date variable like the one in sd_cs (it takes the place
## of starttime, endtime, wavemonth) and keep only the columns needed
sd_pa <- sd_pa |>
  mutate(date = as.Date(starttime)) |>
  select(
    id, wave, date, party, group, lookafter,
    infoTV, infoPaper, infoRadio, infoInternet, infoPeople,
    nvsum, nvn,
    nvsum_policy, nvn_policy, nvsum_nopolicy, nvn_nopolicy,
    nvsum60, nvn60, nvsum75, nvn75, nvsum105, nvn105, nvsum120, nvn120
  )

# source marker for each data set
sd_pa$BES <- "BESIP"

# sd_cs gets its source marker, the calendar year of date as wave, and
# all-missing placeholders for the info* columns that sd_pa carries
sd_cs <- sd_cs |>
  mutate(
    BES = "BES",
    wave = lubridate::year(date),
    infoTV = NA_real_,
    infoPaper = NA_real_,
    infoRadio = NA_real_,
    infoInternet = NA_real_,
    infoPeople = NA_real_
  )

## strip labels - the two lookafter columns are in different 'labelled'
## formats, so bind is not possible otherwise
# quick look at the first five values before and after stripping
sd_pa$lookafter[1:5]
remove_labels(sd_pa$lookafter[1:5])

sd_pa$lookafter <- remove_labels(sd_pa$lookafter)
sd_cs$lookafter <- remove_labels(sd_cs$lookafter)

# bind rows
sd_full <- rbind(sd_pa, sd_cs)

## Clean up ----

# rows with zero summed net valence: tabulate, then build a copy without them
# NOTE(review): sd_full_no0 is not used again in the visible part of this
# script, and sd_full (not sd_full_no0) is what gets saved - confirm intent
table(sd_full$nvsum == 0)
sd_full_no0 <- filter(sd_full, nvsum != 0)

# rescaled version of lookafter: the code 9999 becomes NA, then
# (x - 1) * 100/3 maps a value of 1 to 0 and a value of 4 to 100
sd_full$lookafter <- unclass(sd_full$lookafter)

sd_full <- sd_full |>
  mutate(
    lookafter_na = ifelse(lookafter == 9999, NA_real_, lookafter),
    lookafter_na_100 = (lookafter_na - 1) * (100 / 3)
  ) |>
  select(-lookafter_na) # intermediate column, won't be using this one

# Add party position variables ----
vparty <- read_rds(vparty_path)

# clean and filter vparty: UK rows for the parties of interest from 1997 on,
# with "Liberal Party" folded into "Liberal Democrats" and the SNP shortened
# to match the party labels used in the survey data
vparty <- vparty |>
  filter(country_name=="United Kingdom") |>
  filter(v2paenname %in% c("Conservatives", "Labour", "Liberal Democrats", "Liberal Party", "Scottish National Party")) |>
  filter(year>=1997) |>
  mutate(v2paenname = ifelse(v2paenname=="Liberal Party", "Liberal Democrats", v2paenname),
         v2paenname = ifelse(v2paenname=="Scottish National Party", "SNP", v2paenname)) |>
  rename("party" = "v2paenname")

# define government periods (most recent first); start_date[i] and end_date[i]
# belong to the same period, and each period begins the day after the previous
# one ends.
# The 2001 general election was held on 7 June 2001, so that period starts on
# 2001-06-08 and the 1997 period ends on 2001-06-07 (not in July).
govtperiods <- tibble(
  start_date = as.Date(c("2019-12-13", "2017-06-09", "2015-05-08", "2010-05-12", 
                         "2005-05-06", "2001-06-08", "1997-05-02")),
  end_date = as.Date(c("2023-12-31", "2019-12-12", "2017-06-08", "2015-05-07", 
                       "2010-05-11", "2005-05-05", "2001-06-07")))

# create expanded party-period dataset: one row per (period, party), plus the
# calendar year in which the period starts
parties <- unique(vparty$party)
govtperiods_expanded <- govtperiods %>%
  tidyr::crossing(party = parties) %>%
  mutate(start_year = lubridate::year(start_date))

# join vparty variables to govtperiods_expanded: each period receives the
# vparty row of the same party whose year equals the period's start year
govtperiods_expanded <- govtperiods_expanded %>%
  left_join(vparty, by = c("party", "start_year" = "year"))

# function to get vparty variables for sd_full
# Look up one column of `govtperiods_expanded` for a single party and date.
#
# Arguments:
#   party     party label compared with govtperiods_expanded$party
#   date      date that must lie within [start_date, end_date] of the period
#   variable  name (string) of the column to return
#
# Returns the value of `variable` in the matching row, or NA when the number
# of matching rows is not exactly one (no match, or more than one).
get_vparty_values <- function(party, date, variable) {
  periods <- govtperiods_expanded

  same_party <- periods$party == party
  covers_date <- (periods$start_date <= date) & (periods$end_date >= date)

  # which() drops NA comparisons, so a missing party or date yields no match
  hit <- which(same_party & covers_date)

  if (length(hit) != 1) {
    return(NA)
  }

  periods[[variable]][hit]
}

# look up the four vparty variables row by row (party and date of each survey
# row), then give the new columns their partypos_* names
sd_full <- sd_full |>
  mutate(
    v2paculsup = mapply(
      get_vparty_values, party, date,
      MoreArgs = list(variable = "v2paculsup")
    ),
    v2pawomlab = mapply(
      get_vparty_values, party, date,
      MoreArgs = list(variable = "v2pawomlab")
    ),
    v2pariglef = mapply(
      get_vparty_values, party, date,
      MoreArgs = list(variable = "v2pariglef")
    ),
    v2pawelf = mapply(
      get_vparty_values, party, date,
      MoreArgs = list(variable = "v2pawelf")
    )
  ) |>
  rename(
    partypos_cultdom = v2paculsup,
    partypos_women = v2pawomlab,
    partypos_leftright = v2pariglef,
    partypos_welfare = v2pawelf
  )

# Save ----
# write the merged survey data (relative to the working directory)
write_rds(sd_full, "data/fulldf.rds")
